import json
from concurrent.futures import Executor, ThreadPoolExecutor
from functools import partial
from typing import Dict, List, Literal

import requests
from bs4 import BeautifulSoup
from codetiming import Timer
from tqdm import tqdm

# HTTP headers passed to every ``requests.get`` call in this module. The
# User-Agent is a desktop browser string (Chrome/Edge on macOS) —
# presumably so the site answers the scraper like a regular browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76"
}


def getDocTableById(data_docId: str) -> Dict[str, str]:
    """Fetch one document by id and flatten its HTML table into a dict.

    Downloads the static JSON for ``data_docId``, parses the HTML stored in
    ``data["data"]["docClob"]`` and walks every ``<tr>``: the stripped text
    of the first cell becomes the key, and the texts of the ``<p>`` elements
    inside the second cell, joined with ``";"``, become the value.

    Args:
        data_docId: Document identifier interpolated into the request URL.

    Returns:
        Mapping from first-column text to second-column text. Rows without
        a first cell are skipped; rows whose second cell is missing or
        holds no ``<p>`` map to ``""``.

    Raises:
        KeyError: If the JSON payload has no ``data`` / ``docClob`` entry.
        Exception: Whatever ``requests.get`` or ``response.json()`` raise on
            a network failure or a non-JSON body is propagated unchanged.
    """
    # Spell the ``data_docId=`` prefix out instead of using the f-string
    # ``=`` specifier: that specifier formats with repr(), so a str id would
    # end up wrapped in quotes inside the URL.
    url = (
        "https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectByDocId/"
        f"data_docId={data_docId}.json"
    )
    response = requests.get(url, headers=headers)
    data = response.json()
    soup = BeautifulSoup(data["data"]["docClob"], "html.parser")

    table: Dict[str, str] = {}
    for tr in soup.select("tr"):
        key_cell = tr.select_one("td:nth-child(1)")
        if key_cell is None:
            # No <td> at all (e.g. a header-only row): nothing to use as key.
            continue
        key = key_cell.text.strip()

        # select_one() returns None for a missing cell, whereas select()
        # always returns a (possibly empty) list, so the None check belongs
        # on the cell, not on the paragraph list.
        value_cell = tr.select_one("td:nth-child(2)")
        if value_cell is None:
            table[key] = ""
            continue

        # Joining covers all cases: zero <p> -> "", one -> its text,
        # several -> texts separated by ";".
        table[key] = ";".join(p.text for p in value_cell.select("p"))
    return table


# Endpoint names accepted by getDocIds/getDocTables; the chosen value is
# interpolated into the path of the listing request URL.
APIName = Literal["SelectDocItemByItemPId", "SelectDocByItemIdAndChild"]


def getDocIds(
    pageIndex: int,
    pageSize: int = 18,
    itemId: int = 1855,
    apiName: APIName = "SelectDocItemByItemPId",
) -> List[str]:
    """Return the document ids listed on one page of a listing endpoint.

    Args:
        pageIndex: Page number sent as the ``pageIndex`` query parameter.
        pageSize: Value of the ``pageSize`` query parameter.
        itemId: Value of the ``itemId`` query parameter.
        apiName: Endpoint name placed in the URL path.

    Returns:
        The ``docId`` of every entry in ``data["data"]["rows"]`` of the JSON
        reply, or an empty list when the HTTP status is not 200.
    """
    # The f-string ``=`` specifier renders each int argument as
    # ``name=value``, which is exactly one query-string pair.
    endpoint = f"https://www.cbirc.gov.cn/cbircweb/DocInfo/{apiName}?{itemId=}&{pageSize=}&{pageIndex=}"
    reply = requests.get(endpoint, headers=headers)
    if reply.status_code == 200:
        payload = reply.json()
        return [row["docId"] for row in payload["data"]["rows"]]
    return []


def getDocTables(
    executor: Executor,
    nPages: int = 8,
    pageSize: int = 18,
    itemId: int = 4114,
    apiName: APIName = "SelectDocByItemIdAndChild",
) -> List[Dict[str, str]]:
    """Collect the table contents of every document on the listing pages.

    Works in two phases, both fanned out through ``executor.map`` and
    wrapped in a ``tqdm`` progress bar: first the document ids of pages
    ``1..nPages`` are gathered with ``getDocIds``, then each id is resolved
    with ``getDocTableById``.

    Args:
        executor: Executor used to run the page and document requests.
        nPages: Number of listing pages to read (default 8).
        pageSize: Number of entries requested per page (default 18).
        itemId: Item id forwarded to ``getDocIds`` (default 4114).
        apiName: Endpoint name forwarded to ``getDocIds``
            (default ``"SelectDocByItemIdAndChild"``).

    Returns:
        One table dict per document, in the order the ids were listed.
    """
    fetchPage = partial(getDocIds, pageSize=pageSize, itemId=itemId, apiName=apiName)

    print("正在获取文号列表...")
    pageResults = tqdm(executor.map(fetchPage, range(1, nPages + 1)), total=nPages)
    # Flatten the per-page id lists into a single list, keeping page order.
    allDocIds = [docId for page in pageResults for docId in page]
    print("获取文号列表完成.")

    print("正在获取文档表格内容...")
    progress = tqdm(executor.map(getDocTableById, allDocIds), total=len(allDocIds))
    tables = list(progress)
    print("获取文档表格内容完成.")
    return tables


if __name__ == "__main__":
    from pathlib import Path

    # Make sure the output directory exists BEFORE scraping: otherwise a
    # missing "data/" folder only surfaces as FileNotFoundError after every
    # request has completed, and the collected tables are lost.
    outputPath = Path("data/京金罚决字.json")
    outputPath.parent.mkdir(parents=True, exist_ok=True)

    # The timer covers only the scrape, not the file write below.
    with ThreadPoolExecutor() as executor, Timer(text="程序耗时: {:.2f}s"):
        tables = getDocTables(executor, apiName="SelectDocItemByItemPId", itemId=1855)

    # ensure_ascii=False keeps the Chinese text readable in the JSON file.
    with outputPath.open("w", encoding="utf-8") as f:
        json.dump(tables, f, ensure_ascii=False)

    # Alternative data set: call
    #     getDocTables(executor, apiName="SelectDocByItemIdAndChild", itemId=4114)
    # and write the result to "data/省金罚决字.json" instead.
